import io
import os
import sys
import pandas as pd
import contextlib
from IPython.display import display, Javascript
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
os.environ['NOVA_HOME'] = NOVA_HOME
sys.path.insert(1, os.getenv("NOVA_HOME"))
print(f"NOVA_HOME: {os.getenv('NOVA_HOME')}")
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'NIH', 'indi-image-pilot-20241128')
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct', 'NIH')
LOGS_PATH = os.path.join(NOVA_HOME, "outputs", "preprocessing", "ManuscriptFinalData_80pct", "NIH", "logs")
PLOT_PATH = os.path.join(NOVA_HOME, 'outputs', 'preprocessing', "ManuscriptFinalData_80pct", 'NIH', 'QC_figures')
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
run_calc_hist_new, show_total_valid_tiles_per_marker_and_batch
from tools.preprocessing_tools.qc_reports.qc_config import NIH_d8_panels, NIH_d8_markers, NIH_d8_marker_info, NIH_d8_cell_lines, NIH_d8_cell_lines_to_cond,\
NIH_d8_cell_lines_for_disp, NIH_d8_reps, NIH_d8_line_colors, NIH_d8_lines_order, NIH_d8_custom_palette,\
NIH_d8_expected_dapi_raw
%load_ext autoreload
%autoreload 2
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA
# choose batches
batches = [f'batch{i}' for i in range(1,4)]
batches
['batch1', 'batch2', 'batch3']
df = log_files_qc(LOGS_PATH, only_wt_cond=False, batches=batches, filename_split='-',site_location=0)
#df['cell_line_cond'] = df['cell_line_cond'].str.replace(" ", "_")
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch3 reading logs of batch2 reading logs of batch1 Total of 3 files were read. Before dup handeling (124974, 21) After duplication removal #1: (124974, 22) After duplication removal #2: (124974, 22)
raws = run_validate_folder_structure(root_directory_raw, False,
NIH_d8_panels,
NIH_d8_markers.copy(),
PLOT_PATH,
NIH_d8_marker_info,
NIH_d8_cell_lines_to_cond,
NIH_d8_reps,
NIH_d8_cell_lines_for_disp,
NIH_d8_expected_dapi_raw,
batches=batches,
fig_width=8, fig_height = 40,
expected_count=25, check_antibody=False)
batch1 Folder structure is valid. No bad files are found. Total Sites: 44000
======== batch2 Folder structure is valid. No bad files are found. Total Sites: 44000
======== batch3 Folder structure is valid. No bad files are found. Total Sites: 44000
======== ====================
## Missing data issue was fixed
differences = (raws[0] != raws[1]).stack()
differences = differences[differences].index.to_frame(index=False)
differences.columns = ["Marker", "Rep", "Condition"]
for condition in differences["Condition"].unique():
print(f"Condition: {condition}")
condition_data = differences[differences["Condition"] == condition]
for rep in condition_data["Rep"].unique():
markers = condition_data[condition_data["Rep"] == rep]["Marker"].tolist()
print(f" Rep: {rep}")
print(f" Markers: {', '.join(markers)}")
procs = run_validate_folder_structure(root_directory_proc, True,
NIH_d8_panels,
NIH_d8_markers,
PLOT_PATH,
NIH_d8_marker_info,
NIH_d8_cell_lines_to_cond,
NIH_d8_reps,
NIH_d8_cell_lines_for_disp,
NIH_d8_expected_dapi_raw,
fig_width=8, fig_height=40,
expected_count=25,
check_antibody=False,
batches=batches)
batch1 Folder structure is valid. No bad files are found. Total Sites: 41546
======== batch2 Folder structure is valid. No bad files are found. Total Sites: 41780
======== batch3 Folder structure is valid. No bad files are found. Total Sites: 41565
======== ====================
display_diff(batches, raws, procs, PLOT_PATH, fig_width=8, fig_height=40)
batch1
======== batch2
======== batch3
========
for batch in batches[:1]:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc,
batch,
sample_size_per_markers=50,
cond_count=2,
rep_count=len(NIH_d8_reps),
num_markers=len(NIH_d8_markers))
print(f'{batch} var: ',var)
batch1 var: 0.030423036970459477
By order of filtering
Percentage out of the total sites
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,
batches,
NIH_d8_line_colors,
NIH_d8_panels,
NIH_d8_reps,
figsize=(6,18),
vmax=25)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi,
batches,
dapi_filter_by_brenner,
NIH_d8_line_colors,
NIH_d8_panels,
NIH_d8_reps,
figsize=(6,18))
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi,
batches,
dapi_filter_by_cellpose,
NIH_d8_line_colors,
NIH_d8_panels,
NIH_d8_reps,
figsize=(6,18))
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
show_site_survival_target_brenner(df_dapi,
df_target,
dapi_filter_by_tiling,
NIH_d8_markers,
figsize=(6,18))
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, NIH_d8_markers)
# markers_for_dnls = markers.copy() #TODO need to change according to - if we use all markers or just the d8 ones!!!!
# markers_for_dnls.remove('TIA1')
# markers_for_dnls += ['TDP43B']
total_sum[total_sum.marker.isin(NIH_d8_markers)].n_valid_tiles.sum()
1631945
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
309515.0
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
987203.0
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch1 | ||||
| count | 1760.000000 | 1760.000000 | 1760.000000 | 1.760000e+03 |
| mean | 364.939773 | 3.649398 | 271.837500 | 8.704131e+02 |
| std | 79.588258 | 0.795883 | 73.568335 | 2.132589e+02 |
| min | 19.000000 | 0.190000 | 16.000000 | 4.500000e+01 |
| 25% | 322.000000 | 3.220000 | 224.000000 | 7.367500e+02 |
| 50% | 369.000000 | 3.690000 | 268.000000 | 8.710000e+02 |
| 75% | 421.250000 | 4.212500 | 324.000000 | 1.017000e+03 |
| max | 570.000000 | 5.700000 | 476.000000 | 1.583000e+03 |
| sum | 642294.000000 | NaN | 478434.000000 | 1.531927e+06 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 4.500000e+02 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch2 | ||||
| count | 1758.000000 | 1758.000000 | 1758.000000 | 1.758000e+03 |
| mean | 286.395336 | 2.863953 | 202.039249 | 6.480358e+02 |
| std | 68.996169 | 0.689962 | 54.437216 | 1.732157e+02 |
| min | 4.000000 | 0.040000 | 3.000000 | 1.100000e+01 |
| 25% | 249.000000 | 2.490000 | 170.000000 | 5.490000e+02 |
| 50% | 283.500000 | 2.835000 | 197.000000 | 6.370000e+02 |
| 75% | 322.000000 | 3.220000 | 229.000000 | 7.220000e+02 |
| max | 510.000000 | 5.100000 | 383.000000 | 1.279000e+03 |
| sum | 503483.000000 | NaN | 355185.000000 | 1.139247e+06 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 4.500000e+02 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch3 | ||||
| count | 1758.000000 | 1758.000000 | 1758.000000 | 1.758000e+03 |
| mean | 276.546075 | 2.765461 | 195.264505 | 6.156018e+02 |
| std | 77.778503 | 0.777785 | 61.463669 | 1.878487e+02 |
| min | 14.000000 | 0.140000 | 7.000000 | 3.300000e+01 |
| 25% | 227.250000 | 2.272500 | 155.000000 | 4.930000e+02 |
| 50% | 274.000000 | 2.740000 | 193.500000 | 6.020000e+02 |
| 75% | 321.000000 | 3.210000 | 226.000000 | 7.097500e+02 |
| max | 504.000000 | 5.040000 | 421.000000 | 1.200000e+03 |
| sum | 486168.000000 | NaN | 343275.000000 | 1.082228e+06 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 4.500000e+02 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 5.276000e+03 | 5276.000000 | 5.276000e+03 | 5.276000e+03 |
| mean | 3.093148e+02 | 3.093148 | 2.230656e+02 | 7.114105e+02 |
| std | 8.531182e+01 | 0.853118 | 7.244806e+01 | 2.230372e+02 |
| min | 4.000000e+00 | 0.040000 | 3.000000e+00 | 1.100000e+01 |
| 25% | 2.560000e+02 | 2.560000 | 1.750000e+02 | 5.660000e+02 |
| 50% | 3.020000e+02 | 3.020000 | 2.140000e+02 | 6.830000e+02 |
| 75% | 3.650000e+02 | 3.650000 | 2.640000e+02 | 8.550000e+02 |
| max | 5.700000e+02 | 5.700000 | 4.760000e+02 | 1.583000e+03 |
| sum | 1.631945e+06 | NaN | 1.176894e+06 | 3.753402e+06 |
| expected_count | 4.500000e+02 | 450.000000 | 4.500000e+02 | 4.500000e+02 |
For each batch, cell line, replicate and marker: Total number of tiles
show_total_valid_tiles_per_marker_and_batch(total_sum, vmax=15000)
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap,
extra_index='marker',
vmin=None, vmax=None,
xlabel = 'Total number of tiles',
show_sum=True, figsize=(7,28),
fmt=".0f")
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
For each batch, cell line, replicate and markerTotal number of tiles
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap,
extra_index='marker',
vmin=None, vmax=None,
xlabel = 'Total number of whole cells',
show_sum=True,
figsize=(7,28),
fmt=".0f")
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites,
NIH_d8_lines_order,
NIH_d8_custom_palette,
y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)',
figsize=(16,6))
plot_cell_count(df_no_empty_sites,
NIH_d8_lines_order,
NIH_d8_custom_palette,
y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site',
figsize=(16,6))
plot_cell_count(df_no_empty_sites,
NIH_d8_lines_order,
NIH_d8_custom_palette,
y='site_cell_count',
title='Cellpose Cell Count Average per Site',
figsize=(16,6))
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
cell_line_cond FUSHeterozygous Untreated 13.855665 FUSHomozygous Untreated 13.848535 FUSRevertant Untreated 13.767027 WT Untreated 11.889007 WT stress 11.968573 Name: n_valid_tiles, dtype: float64
df_dapi[['site_cell_count']].mean()
site_cell_count 30.081144 dtype: float64
plot_catplot(df_dapi,
NIH_d8_custom_palette,
NIH_d8_reps,
x='n_valid_tiles',
x_title='valid tiles count',
batch_min=1,
batch_max=3,
height=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1061: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']
plot_hm_of_mean_cell_count_per_tile(df_dapi,
split_by='rep',
rows='cell_line_cond',
columns='panel',
figsize=(18,6))
# for batch in batches:
# print(batch)
# run_calc_hist_new(f'{batch}', dnls_opera_cell_lines_for_disp, dnls_opera_markers,
# root_directory_raw, root_directory_proc,
# hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
# print("="*30)
# save notebook as HTML
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html {NOVA_HOME}/tools/preprocessing_tools/qc_reports/qc_report_NIH_NeuronsDay8.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/ManuscriptFinalData/qc_report_NIH_NeuronsDay8.html')